1   package org.apache.lucene.analysis.ar;
2   
3   
4   /*
5    * Licensed to the Apache Software Foundation (ASF) under one or more
6    * contributor license agreements.  See the NOTICE file distributed with
7    * this work for additional information regarding copyright ownership.
8    * The ASF licenses this file to You under the Apache License, Version 2.0
9    * (the "License"); you may not use this file except in compliance with
10   * the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  
21  import static org.apache.lucene.analysis.util.StemmerUtil.*;
22  
23  /**
24   *  Stemmer for Arabic.
25   *  <p>
26   *  Stemming  is done in-place for efficiency, operating on a termbuffer.
27   *  <p>
28   *  Stemming is defined as:
29   *  <ul>
30   *  <li> Removal of attached definite article, conjunction, and prepositions.
31   *  <li> Stemming of common suffixes.
32   * </ul>
33   *
34   */
35  public class ArabicStemmer {
36    public static final char ALEF = '\u0627';
37    public static final char BEH = '\u0628';
38    public static final char TEH_MARBUTA = '\u0629';
39    public static final char TEH = '\u062A';
40    public static final char FEH = '\u0641';
41    public static final char KAF = '\u0643';
42    public static final char LAM = '\u0644';
43    public static final char NOON = '\u0646';
44    public static final char HEH = '\u0647';
45    public static final char WAW = '\u0648';
46    public static final char YEH = '\u064A';
47    
48    public static final char prefixes[][] = {
49        ("" + ALEF + LAM).toCharArray(), 
50        ("" + WAW + ALEF + LAM).toCharArray(), 
51        ("" + BEH + ALEF + LAM).toCharArray(),
52        ("" + KAF + ALEF + LAM).toCharArray(),
53        ("" + FEH + ALEF + LAM).toCharArray(),
54        ("" + LAM + LAM).toCharArray(),
55        ("" + WAW).toCharArray(),
56    };
57    
58    public static final char suffixes[][] = {
59      ("" + HEH + ALEF).toCharArray(), 
60      ("" + ALEF + NOON).toCharArray(), 
61      ("" + ALEF + TEH).toCharArray(), 
62      ("" + WAW + NOON).toCharArray(), 
63      ("" + YEH + NOON).toCharArray(), 
64      ("" + YEH + HEH).toCharArray(),
65      ("" + YEH + TEH_MARBUTA).toCharArray(),
66      ("" + HEH).toCharArray(),
67      ("" + TEH_MARBUTA).toCharArray(),
68      ("" + YEH).toCharArray(),
69  };
70    
71    /**
72     * Stem an input buffer of Arabic text.
73     * 
74     * @param s input buffer
75     * @param len length of input buffer
76     * @return length of input buffer after normalization
77     */
78    public int stem(char s[], int len) {
79      len = stemPrefix(s, len);
80      len = stemSuffix(s, len);
81      
82      return len;
83    }
84    
85    /**
86     * Stem a prefix off an Arabic word.
87     * @param s input buffer
88     * @param len length of input buffer
89     * @return new length of input buffer after stemming.
90     */
91    public int stemPrefix(char s[], int len) {
92      for (int i = 0; i < prefixes.length; i++) 
93        if (startsWithCheckLength(s, len, prefixes[i]))
94          return deleteN(s, 0, len, prefixes[i].length);
95      return len;
96    }
97  
98    /**
99     * Stem suffix(es) off an Arabic word.
100    * @param s input buffer
101    * @param len length of input buffer
102    * @return new length of input buffer after stemming
103    */
104   public int stemSuffix(char s[], int len) {
105     for (int i = 0; i < suffixes.length; i++) 
106       if (endsWithCheckLength(s, len, suffixes[i]))
107         len = deleteN(s, len - suffixes[i].length, len, suffixes[i].length);
108     return len;
109   }
110   
111   /**
112    * Returns true if the prefix matches and can be stemmed
113    * @param s input buffer
114    * @param len length of input buffer
115    * @param prefix prefix to check
116    * @return true if the prefix matches and can be stemmed
117    */
118   boolean startsWithCheckLength(char s[], int len, char prefix[]) {
119     if (prefix.length == 1 && len < 4) { // wa- prefix requires at least 3 characters
120       return false;
121     } else if (len < prefix.length + 2) { // other prefixes require only 2.
122       return false;
123     } else {
124       for (int i = 0; i < prefix.length; i++)
125         if (s[i] != prefix[i])
126           return false;
127         
128       return true;
129     }
130   }
131   
132   /**
133    * Returns true if the suffix matches and can be stemmed
134    * @param s input buffer
135    * @param len length of input buffer
136    * @param suffix suffix to check
137    * @return true if the suffix matches and can be stemmed
138    */
139   boolean endsWithCheckLength(char s[], int len, char suffix[]) {
140     if (len < suffix.length + 2) { // all suffixes require at least 2 characters after stemming
141       return false;
142     } else {
143       for (int i = 0; i < suffix.length; i++)
144         if (s[len - suffix.length + i] != suffix[i])
145           return false;
146         
147       return true;
148     }
149   }  
150 }